library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Descargar datos genómicos: datos
eukaryotes<-read_csv("data/eukaryotes.csv", col_names = T)
## Parsed with column specification:
## cols(
## `#Organism Name` = col_character(),
## `Organism Groups` = col_character(),
## Strain = col_character(),
## BioSample = col_character(),
## BioProject = col_character(),
## Assembly = col_character(),
## Level = col_character(),
## `Size(Mb)` = col_double(),
## `GC%` = col_double(),
## Replicons = col_character(),
## WGS = col_character(),
## Scaffolds = col_double(),
## CDS = col_double(),
## `Release Date` = col_datetime(format = ""),
## `GenBank FTP` = col_character(),
## `RefSeq FTP` = col_character()
## )
eukaryotes %>% glimpse()
## Rows: 5,131
## Columns: 16
## $ `#Organism Name` <chr> "Emiliania huxleyi CCMP1516", "Arabidopsis thaliana…
## $ `Organism Groups` <chr> "Eukaryota;Protists;Other Protists", "Eukaryota;Pla…
## $ Strain <chr> "CCMP1516", NA, "A17", NA, NA, NA, NA, NA, NA, "B80…
## $ BioSample <chr> "SAMN02744062", "SAMN03081427", "SAMN02299339", "SA…
## $ BioProject <chr> "PRJNA77753", "PRJNA10719", "PRJNA10791", "PRJNA198…
## $ Assembly <chr> "GCA_000372725.1", "GCA_000001735.1", "GCA_00021949…
## $ Level <chr> "Scaffold", "Chromosome", "Chromosome", "Chromosome…
## $ `Size(Mb)` <dbl> 167.67600, 119.66800, 412.92400, 978.97200, 823.786…
## $ `GC%` <dbl> 64.5000, 36.0528, 34.0470, 35.1208, 35.7097, 0.0000…
## $ Replicons <chr> NA, "chromosome 1:NC_003070.9/CP002684.1; chromosom…
## $ WGS <chr> "AHAL01", NA, "APNO01", "ACUP02", "AEKE02", "FJWB02…
## $ Scaffolds <dbl> 7795, 7, 2187, 1191, 3224, 72295, 58, 279439, 598, …
## $ CDS <dbl> 38554, 48350, 57661, 71525, 36010, 0, 41070, 0, 584…
## $ `Release Date` <dttm> 2013-04-19, 2001-08-13, 2011-08-12, 2010-01-05, 20…
## $ `GenBank FTP` <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/372…
## $ `RefSeq FTP` <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/372…
eukaryotes %>% select(c("#Organism Name","Organism Groups","Strain","Size(Mb)","GC%","CDS")) -> eukaryotes_subset
eukaryotes_subset %>% head()
## # A tibble: 6 x 6
## `#Organism Name` `Organism Groups` Strain `Size(Mb)` `GC%` CDS
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Emiliania huxleyi CCM… Eukaryota;Protists;Othe… CCMP15… 168. 64.5 38554
## 2 Arabidopsis thaliana Eukaryota;Plants;Land P… <NA> 120. 36.1 48350
## 3 Medicago truncatula Eukaryota;Plants;Land P… A17 413. 34.0 57661
## 4 Glycine max Eukaryota;Plants;Land P… <NA> 979. 35.1 71525
## 5 Solanum lycopersicum Eukaryota;Plants;Land P… <NA> 824. 35.7 36010
## 6 Hordeum vulgare Eukaryota;Plants;Land P… <NA> 9789. 0 0
eukaryotes_subset %>% rename(organism="#Organism Name",groups="Organism Groups",genome_size_mb="Size(Mb)",gc_percent="GC%") %>% rename_with(tolower) -> eukaryotes_subset
eukaryotes_subset %>% head()
## # A tibble: 6 x 6
## organism groups strain genome_size_mb gc_percent cds
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Emiliania huxleyi… Eukaryota;Protists… CCMP15… 168. 64.5 38554
## 2 Arabidopsis thali… Eukaryota;Plants;L… <NA> 120. 36.1 48350
## 3 Medicago truncatu… Eukaryota;Plants;L… A17 413. 34.0 57661
## 4 Glycine max Eukaryota;Plants;L… <NA> 979. 35.1 71525
## 5 Solanum lycopersi… Eukaryota;Plants;L… <NA> 824. 35.7 36010
## 6 Hordeum vulgare Eukaryota;Plants;L… <NA> 9789. 0 0
eukaryotes_subset %>% separate(groups,c("domain","kingdom","class"),sep = ";") -> eukaryotes_subset
eukaryotes_subset %>% head()
## # A tibble: 6 x 8
## organism domain kingdom class strain genome_size_mb gc_percent cds
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Emiliania hux… Eukary… Protists Other … CCMP1… 168. 64.5 38554
## 2 Arabidopsis t… Eukary… Plants Land P… <NA> 120. 36.1 48350
## 3 Medicago trun… Eukary… Plants Land P… A17 413. 34.0 57661
## 4 Glycine max Eukary… Plants Land P… <NA> 979. 35.1 71525
## 5 Solanum lycop… Eukary… Plants Land P… <NA> 824. 35.7 36010
## 6 Hordeum vulga… Eukary… Plants Land P… <NA> 9789. 0 0
eukaryotes_subset %>% group_by(kingdom) %>% count()
## # A tibble: 5 x 2
## # Groups: kingdom [5]
## kingdom n
## <chr> <int>
## 1 Animals 1250
## 2 Fungi 2844
## 3 Other 20
## 4 Plants 478
## 5 Protists 539
eukaryotes_subset %>% group_by(kingdom) %>% summarise(mean=mean(genome_size_mb),sd=sd(genome_size_mb),max=max(genome_size_mb),min=min(genome_size_mb))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 5
## kingdom mean sd max min
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Animals 1132. 1434. 32394. 0.663
## 2 Fungi 29.9 20.2 216. 2.08
## 3 Other 108. 129. 543. 12.1
## 4 Plants 1023. 2738. 27603. 0.986
## 5 Protists 46.7 57.1 808. 1.45
eukaryotes_subset %>% select(organism,kingdom,class,genome_size_mb) %>% group_by(kingdom) %>% top_n(.,n = 1,wt = genome_size_mb)
## # A tibble: 5 x 4
## # Groups: kingdom [5]
## organism kingdom class genome_size_mb
## <chr> <chr> <chr> <dbl>
## 1 Ambystoma mexicanum Animals Amphibians 32394.
## 2 Pinus lambertiana Plants Land Plants 27603.
## 3 Saccharina japonica Other Other 543.
## 4 Uromyces viciae-fabae Fungi Basidiomycetes 216.
## 5 Symbiodinium microadriaticum Protists Other Protists 808.
eukaryotes_subset %>% ggplot(.,aes(x=kingdom,y=genome_size_mb,color=kingdom)) +
geom_point(alpha=0.3) + geom_boxplot(alpha=0.5)
eukaryotes_subset %>% filter(genome_size_mb >0 & cds > 0) %>% ggplot(.,aes(x=cds,y=genome_size_mb,color=kingdom, size=cds)) +
geom_point(alpha=0.5) -> p
p
library(plotly)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
options(scipen = 999)
eukaryotes_subset %>% mutate(genome_size_mb=round(genome_size_mb,2)) %>% ggplot(.,aes(x=kingdom,y=genome_size_mb,color=kingdom, alpha=0.4,text= paste("organism:",organism,"\ncds:",cds))) +
geom_point() +theme_cowplot(12) -> p
ggplotly(p, tooltip = c("text","x","y")) %>% config(displayModeBar = F)